The dataset this week comes from Kaggle and contains all Scooby-Doo episodes. More info about Scooby Doo can be found on ScoobyPedia.

1 Setup

pacman::p_load(
  tidytuesdayR,
  tidyverse,
  lubridate,
  magrittr,
  glue,
  skimr,
  here
)

2 Raw Data

Get data and write to local file

tt_load(2021, week = 29) %>% 
  pluck("scoobydoo") %>% 
  write_csv2(glue("{here()}/data.csv"))

Read data from local file (d_raw)

d_raw <- read_csv2(
  glue("{here()}/data.csv"),
  col_types = cols(.default = "c"),
  na = c("NA", "NULL")
)

Create working copy (d) and sort columns alphabetically

d <- d_raw %>% select(sort(colnames(.)))

3 Inspection

d
glimpse(d)
## Rows: 603
## Columns: 75
## $ and_that                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "dog"…
## $ another_mystery          <chr> "1", "0", "0", "0", "1", "0", "0", "0", "0", …
## $ arrested                 <chr> "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE…
## $ batman                   <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ blue_falcon              <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ captured_daphnie         <chr> "FALSE", "TRUE", "FALSE", "FALSE", "FALSE", "…
## $ captured_fred            <chr> "FALSE", "TRUE", "FALSE", "FALSE", "FALSE", "…
## $ captured_scooby          <chr> "FALSE", "FALSE", "FALSE", "FALSE", "TRUE", "…
## $ captured_shaggy          <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ captured_velma           <chr> "FALSE", "TRUE", "FALSE", "FALSE", "FALSE", "…
## $ caught_daphnie           <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ caught_fred              <chr> "FALSE", "FALSE", "FALSE", "TRUE", "FALSE", "…
## $ caught_not               <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ caught_other             <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ caught_scooby            <chr> "TRUE", "FALSE", "TRUE", "FALSE", "TRUE", "FA…
## $ caught_shaggy            <chr> "TRUE", "TRUE", "FALSE", "FALSE", "FALSE", "F…
## $ caught_velma             <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ culprit_amount           <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", …
## $ culprit_gender           <chr> "Male", "Male", "Male", "Male", "Male", "Male…
## $ culprit_name             <chr> "Mr. Wickles", "Cptn. Cuttler", "Bluestone th…
## $ daphnie_va               <chr> "Stefanianna Christopherson", "Stefanianna Ch…
## $ date_aired               <chr> "1969-09-13", "1969-09-20", "1969-09-27", "19…
## $ door_gag                 <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ engagement               <chr> "556", "479", "455", "426", "391", "384", "35…
## $ format                   <chr> "TV Series", "TV Series", "TV Series", "TV Se…
## $ fred_va                  <chr> "Frank Welker", "Frank Welker", "Frank Welker…
## $ groovy                   <chr> "0", "0", "2", "1", "0", "0", "1", "0", "0", …
## $ hex_girls                <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ if_it_wasnt_for          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "thes…
## $ imdb                     <chr> "8.1", "8.1", "8", "7.8", "7.5", "8.4", "7.6"…
## $ index                    <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", …
## $ jeepers                  <chr> "0", "0", "0", "0", "0", "1", "0", "0", "0", …
## $ jinkies                  <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", …
## $ just_about_wrapped_up    <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", …
## $ monster_amount           <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", …
## $ monster_gender           <chr> "Male", "Male", "Male", "Male", "Female", "Ma…
## $ monster_name             <chr> "Black Knight", "Ghost of Cptn. Cuttler", "Ph…
## $ monster_real             <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ monster_species          <chr> "Object", "Human", "Human", "Human", "Human",…
## $ monster_subtype          <chr> "Suit", "Suit", "Phantom", "Miner", "Witch Do…
## $ monster_type             <chr> "Possessed Object", "Ghost", "Ghost", "Ancien…
## $ motive                   <chr> "Theft", "Theft", "Treasure", "Natural Resour…
## $ my_glasses               <chr> "1", "0", "0", "0", "1", "0", "0", "1", "0", …
## $ network                  <chr> "CBS", "CBS", "CBS", "CBS", "CBS", "CBS", "CB…
## $ non_suspect              <chr> "FALSE", "TRUE", "TRUE", "FALSE", "FALSE", "F…
## $ number_of_snacks         <chr> "2", "1", "3", "2", "2", "4", "4", "0", "1", …
## $ rooby_rooby_roo          <chr> "1", "0", "0", "0", "0", "1", "1", "1", "1", …
## $ run_time                 <chr> "21", "22", "21", "21", "21", "21", "21", "21…
## $ scooby_doo_where_are_you <chr> "0", "1", "0", "0", "1", "0", "0", "1", "0", …
## $ scooby_dum               <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ scooby_va                <chr> "Don Messick", "Don Messick", "Don Messick", …
## $ scrappy_doo              <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ season                   <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", …
## $ series_name              <chr> "Scooby Doo, Where Are You!", "Scooby Doo, Wh…
## $ set_a_trap               <chr> "0", "0", "0", "0", "0", "0", "1", "1", "0", …
## $ setting_country_state    <chr> "United States", "United States", "United Sta…
## $ setting_terrain          <chr> "Urban", "Coast", "Island", "Cave", "Desert",…
## $ shaggy_va                <chr> "Casey Kasem", "Casey Kasem", "Casey Kasem", …
## $ snack_daphnie            <chr> "FALSE", "FALSE", "FALSE", "TRUE", "TRUE", "F…
## $ snack_fred               <chr> "TRUE", "FALSE", "TRUE", "FALSE", "FALSE", "T…
## $ snack_scooby             <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ snack_shaggy             <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ snack_velma              <chr> "FALSE", "TRUE", "FALSE", "FALSE", "FALSE", "…
## $ split_up                 <chr> "1", "0", "0", "1", "0", "0", "1", "0", "0", …
## $ suspects_amount          <chr> "2", "2", "0", "2", "1", "2", "1", "2", "1", …
## $ title                    <chr> "What a Night for a Knight", "A Clue for Scoo…
## $ trap_work_first          <chr> NA, "FALSE", "FALSE", "TRUE", NA, "TRUE", "FA…
## $ unmask_daphnie           <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ unmask_fred              <chr> "FALSE", "TRUE", "TRUE", "TRUE", "FALSE", "TR…
## $ unmask_other             <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ unmask_scooby            <chr> "TRUE", "FALSE", "FALSE", "FALSE", "TRUE", "F…
## $ unmask_shaggy            <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ unmask_velma             <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
## $ velma_va                 <chr> "Nicole Jaffe", "Nicole Jaffe", "Nicole Jaffe…
## $ zoinks                   <chr> "1", "3", "1", "2", "0", "2", "1", "0", "0", …

3.1 Coltypes

Names of columns that appear logical at first glance

# specify columns
col_lgl <- d %>% 
  select(c(
    "arrested",
    "batman",
    "blue_falcon",
    "door_gag",
    "hex_girls",
    "monster_real",
    "non_suspect",
    "scooby_dum",
    "scrappy_doo",
    "trap_work_first",
    starts_with("captured_"),
    starts_with("caught_"),
    starts_with("snack_"),
    starts_with("unmask_")
  )) %>% 
  colnames()

# inspect sample
d %>% select(col_lgl) %>% slice_sample(n = 15)

Names of columns that appear numeric at first glance

# specify columns
col_num <- c(
  "another_mystery",
  "culprit_amount",
  "engagement",
  "groovy",
  "imdb",
  "index",
  "jeepers",
  "jinkies",
  "just_about_wrapped_up",
  "monster_amount",
  "my_glasses",
  "number_of_snacks",
  "rooby_rooby_roo",
  "run_time",
  "scooby_doo_where_are_you",
  "season",
  "set_a_trap",
  "split_up",
  "suspects_amount",
  "zoinks")

# inspect sample
d %>% select(col_num) %>% slice_sample(n = 15)

4 Wrangling

4.1 Coercion

4.1.1 Date

Parse date from character vector date_aired

# parse date
d %<>% mutate(aired_date = ymd(date_aired))

# verify result
d %>% 
  select(date_aired, aired_date) %>% 
  slice_sample(n = 15)
# drop original column
d %<>% select(-date_aired)

Extract date elements from aired_date

# get year and month
d %<>% mutate(
  aired_year = year(aired_date),
  aired_month = month(aired_date, label = TRUE)
) 

# get decade
d %<>% mutate(
  aired_decade = glue("{floor(aired_year/10) * 10}s")
) %>% as_factor()

# verify result
d %>% select(
    aired_date,
    aired_year,
    aired_month,
    aired_decade) %>% 
  slice_sample(n = 15)

4.1.2 Logical

Show all unique values in the columns that are presumed to be logical to manually verify this assumption.

# print unique values
d %>% 
  select(col_lgl) %>% 
  map(~unique(.x)) %>% 
  flatten_chr() %>% 
  unique()
## [1] "TRUE"  "FALSE" NA

There are indeed no other values than "TRUE"/"FALSE" or NA.

# convert to logical
d %<>% mutate(across(col_lgl, as.logical))

# verify result
d %>% select(col_lgl) %>% slice_sample(n = 15)

4.1.3 Numeric

Do the (presumed) numeric columns contain non-numeric characters?

# store unique values per column
uniq <- d %>% 
  select(col_num) %>% 
  map(~unique(.x)) 

# identify columns with non-numeric characters
non_num <- uniq %>%
  map( ~ str_detect(.x, "\\D") %>%
         replace_na(FALSE) %>%
         any()) %>%
  flatten_lgl()

# print unique values in columns with non-numeric characters
uniq[non_num]
## $imdb
##  [1] "8.1" "8"   "7.8" "7.5" "8.4" "7.6" "8.2" "8.5" "8.7" "8.3" "7.7" "7.9"
## [13] "6.9" "7.4" "7.2" "7.3" "7"   "6.3" "6.8" "7.1" "6.6" "6.4" "6"   "6.7"
## [25] "6.5" "6.2" "6.1" "5.8" "5.1" "5"   "5.9" "5.5" "5.3" "5.2" "5.4" "5.6"
## [37] "4.8" "4.6" "4.9" "8.8" "8.6" "9.2" "9.1" "8.9" "9"   "9.3" "9.6" "4.2"
## [49] "5.7" NA   
## 
## $number_of_snacks
##  [1] "2"                 "1"                 "3"                
##  [4] "4"                 "0"                 "1 box"            
##  [7] "a couple"          "6"                 "several"          
## [10] "3 boxes"           "truck load"        "5"                
## [13] "2 boxes"           "lifetime supply"   NA                 
## [16] "8"                 "10"                "wheel barrel full"
## [19] "several boxes"    
## 
## $season
## [1] "1"         "2"         "Crossover" "3"         "Movie"     "Special"  
## [7] "4"

The only non-numeric character in imdb is the decimal ., so this column can easily be converted to numeric. On the other hand, for season and number_of_snacks this doesn’t make sense.

# exclude season and number of snacks
col_num_final <- col_num[!(col_num %in% c("season", "number_of_snacks"))]

I like to keep the original columns (temporarily) for easy verification.

# convert to numeric and add suffix
d_num <- d %>% 
  select(all_of(col_num_final)) %>%  
  map_dfr(as.numeric) %>% 
  rename_with( ~ glue("{.x}_num"))

# bind converted data to original data
d %<>% bind_cols(d_num)

# sort columns alphabetically
d %<>% select(sort(colnames(.)))

# verify result
d %>% select(sort(c(
  col_num_final, glue("{col_num_final}_num")
))) %>% slice_sample(n = 15)
# drop old columns and remove suffix from new columns
d %<>%
  select(-(col_num_final)) %>%
  rename_with(~ str_remove(.x, "_num"))

4.2 Nested

Identify character columns containing ≥1 comma (might indicate nested data)

d_chr <- d %>% select(where(is.character))
d_chr[d_chr %>%
        map( ~ any(str_detect(.x, ","))) %>%
        flatten_lgl() %>%
        replace_na(FALSE)]

Create nested list columns where applicable

# note to self: not working, figure out why...
col_nested <-
  c(
    "culprit_gender",
    "culprit_name",
    "monster_gender",
    "monster_name",
    "monster_species",
    "monster_type",
    "monster_subtype"
  )

d %<>% mutate(
  across(
    col_nested, 
    ~ str_squish(str_split(.x, ","))
  )
)
# the non-elegant -but working- alternative
d$culprit_gender %<>% str_split(",")
d$culprit_name %<>% str_split(",")
d$monster_gender %<>% str_split(",")
d$monster_name %<>% str_split(",")
d$monster_species %<>% str_split(",")
d$monster_type %<>% str_split(",")
d$monster_subtype %<>% str_split(",")

5 Clean Data

Result of the data cleaning and wrangling.

d

6 Exploration

6.1 Skim

skim(d)
Data summary
Name d
Number of rows 603
Number of columns 78
_______________________
Column type frequency:
character 17
Date 1
factor 1
list 7
logical 33
numeric 19
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
aired_decade 0 1.0 5 5 0 7 0
and_that 528 0.1 3 80 0 64 0
daphnie_va 165 0.7 11 26 0 9 0
format 0 1.0 5 21 0 5 0
fred_va 219 0.6 9 18 0 5 0
if_it_wasnt_for 414 0.3 3 116 0 107 0
motive 67 0.9 4 16 0 27 0
network 0 1.0 3 20 0 11 0
number_of_snacks 1 1.0 1 17 0 18 0
scooby_va 28 0.9 10 12 0 5 0
season 0 1.0 1 9 0 7 0
series_name 0 1.0 4 42 0 29 0
setting_country_state 0 1.0 4 16 0 79 0
setting_terrain 0 1.0 3 8 0 15 0
shaggy_va 32 0.9 10 15 0 7 0
title 0 1.0 4 76 0 602 0
velma_va 218 0.6 9 19 0 12 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
aired_date 0 1 1969-09-13 2021-02-25 1988-09-10 448

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
aired_month 0 1 TRUE 12 okt: 158, sep: 105, nov: 96, dec: 56

Variable type: list

skim_variable n_missing complete_rate n_unique min_length max_length
culprit_gender 163 0.7 23 1 11
culprit_name 163 0.7 433 1 11
monster_gender 87 0.9 43 1 19
monster_name 87 0.9 481 1 17
monster_species 87 0.9 194 1 19
monster_subtype 88 0.8 269 1 19
monster_type 87 0.9 132 1 19

Variable type: logical

skim_variable n_missing complete_rate mean count
arrested 155 0.7 0.85 TRU: 381, FAL: 67
batman 0 1.0 0.01 FAL: 599, TRU: 4
blue_falcon 0 1.0 0.05 FAL: 570, TRU: 33
captured_daphnie 165 0.7 0.21 FAL: 347, TRU: 91
captured_fred 219 0.6 0.18 FAL: 313, TRU: 71
captured_scooby 28 0.9 0.14 FAL: 492, TRU: 83
captured_shaggy 32 0.9 0.15 FAL: 486, TRU: 85
captured_velma 218 0.6 0.19 FAL: 311, TRU: 74
caught_daphnie 165 0.7 0.07 FAL: 409, TRU: 29
caught_fred 219 0.6 0.34 FAL: 252, TRU: 132
caught_not 0 1.0 0.05 FAL: 572, TRU: 31
caught_other 0 1.0 0.14 FAL: 519, TRU: 84
caught_scooby 28 0.9 0.28 FAL: 415, TRU: 160
caught_shaggy 32 0.9 0.13 FAL: 494, TRU: 77
caught_velma 218 0.6 0.11 FAL: 344, TRU: 41
door_gag 0 1.0 0.10 FAL: 544, TRU: 59
hex_girls 0 1.0 0.01 FAL: 597, TRU: 6
monster_real 87 0.9 0.22 FAL: 404, TRU: 112
non_suspect 160 0.7 0.10 FAL: 397, TRU: 46
scooby_dum 0 1.0 0.03 FAL: 586, TRU: 17
scrappy_doo 0 1.0 0.27 FAL: 438, TRU: 165
snack_daphnie 165 0.7 0.11 FAL: 389, TRU: 49
snack_fred 219 0.6 0.05 FAL: 366, TRU: 18
snack_scooby 27 1.0 0.02 FAL: 564, TRU: 12
snack_shaggy 31 0.9 0.08 FAL: 529, TRU: 43
snack_velma 218 0.6 0.08 FAL: 356, TRU: 29
trap_work_first 354 0.4 0.50 TRU: 125, FAL: 124
unmask_daphnie 165 0.7 0.08 FAL: 401, TRU: 37
unmask_fred 219 0.6 0.27 FAL: 282, TRU: 102
unmask_other 0 1.0 0.06 FAL: 568, TRU: 35
unmask_scooby 28 0.9 0.04 FAL: 552, TRU: 23
unmask_shaggy 32 0.9 0.02 FAL: 558, TRU: 13
unmask_velma 218 0.6 0.24 FAL: 291, TRU: 94

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
aired_year 0 1.0 2e+03 2e+01 1969 1979 1988 2012 2e+03 ▇▇▁▅▇
another_mystery 219 0.6 2e-01 4e-01 0 0 0 0 3e+00 ▇▂▁▁▁
culprit_amount 0 1.0 1e+00 1e+00 0 0 1 1 1e+01 ▇▁▁▁▁
engagement 15 1.0 6e+02 5e+03 7 27 54 128 1e+05 ▇▁▁▁▁
groovy 32 0.9 6e-02 6e-01 0 0 0 0 1e+01 ▇▁▁▁▁
imdb 15 1.0 7e+00 7e-01 4 7 7 8 1e+01 ▁▁▇▆▁
index 0 1.0 3e+02 2e+02 1 152 302 452 6e+02 ▇▇▇▇▇
jeepers 165 0.7 6e-01 1e+00 0 0 0 1 1e+01 ▇▁▁▁▁
jinkies 218 0.6 1e+00 2e+00 0 0 1 2 1e+01 ▇▁▁▁▁
just_about_wrapped_up 218 0.6 5e-02 2e-01 0 0 0 0 1e+00 ▇▁▁▁▁
monster_amount 0 1.0 2e+00 2e+00 0 1 1 2 2e+01 ▇▁▁▁▁
my_glasses 218 0.6 1e-01 3e-01 0 0 0 0 2e+00 ▇▁▁▁▁
rooby_rooby_roo 28 0.9 7e-01 8e-01 0 0 1 1 7e+00 ▇▁▁▁▁
run_time 0 1.0 2e+01 2e+01 4 12 22 23 9e+01 ▇▃▁▁▁
scooby_doo_where_are_you 32 0.9 1e-01 4e-01 0 0 0 0 4e+00 ▇▁▁▁▁
set_a_trap 219 0.6 1e-01 3e-01 0 0 0 0 2e+00 ▇▁▁▁▁
split_up 219 0.6 3e-01 6e-01 0 0 0 1 2e+00 ▇▁▃▁▁
suspects_amount 0 1.0 3e+00 3e+00 0 1 3 4 2e+01 ▇▂▁▁▁
zoinks 32 0.9 2e+00 3e+00 0 0 1 3 3e+01 ▇▁▁▁▁

7 Visualization

7.1 Monsters are male

plotdata <- d %>%
  select(aired_decade, monster_gender) %>%
  unnest(monster_gender) %>%
  group_by(aired_decade, monster_gender) %>%
  filter(monster_gender %in% c("Male", "Female")) %>%
  tally()

plotdata %>% slice_sample(n = 10)
ggplot(plotdata, aes(
  fill = as.factor(monster_gender),
  y = n,
  x = as.factor(aired_decade)
)) +
  geom_bar(position = "fill", stat = "identity")

… and so are culprits!

gd <- d %>% 
  select(aired_year, culprit_gender) %>% 
  unnest(culprit_gender) %>% 
  mutate(culprit_gender = str_squish(culprit_gender)) %>% 
  group_by(aired_year, culprit_gender) %>% 
  filter(!(culprit_gender %in% c("", "None"))) %>% 
  tally()

ggplot(gd, aes(
  fill = as.factor(culprit_gender),
  y = n,
  x = as.factor(aired_year)
)) + geom_bar(position = "fill", stat = "identity")

7.2

gd <- d %>%
  select(starts_with("unmask_")) %>%
  rename_with( ~ str_remove(.x, "unmask_")) %>%
  pivot_longer(1:6, names_to = "name") %>% 
  group_by(name, value) %>% 
  filter(!is.na(value)) %>% 
  tally()

ggplot(gd, aes(x="", y = n, fill = value)) +
  geom_bar(stat="identity", width=1) +
  coord_polar("y", start = 0) + 
  facet_wrap(~ name, ncol = 2) +
  theme_void()